#==============================================================================#
# 1-extract-profiles-bd.R
# Matt Curtis 27/06/22
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
# pull all french records from the huge profiles file 
# this gives us births and deaths
# updated feb 1 2023
# updated may 29, 2025
#==============================================================================#
library(arrow)
library(tidytable)
library(stringr)

#------------------------------------------------------------------------------#
# start from a clean slate: remove every (non-hidden) object in the current
# environment; packages attached above stay attached
rm(list = ls())
gc()

# set working directory once; all relative paths below ("./1_raw_data/...",
# "./2_scripts/...") are resolved against it
# NOTE(review): hard-coded to a folder in the user's home directory -- adjust
# if the replication package lives somewhere else
setwd("~/Replication package")

#------------------------------------------------------------------------------#

# Terms used to flag a location string as French.
# `french_good`: whole-word terms that switch the France flag on
# (spellings / abbreviations of "France" in several languages).
french_good <- c(
  "france", "fr", "francia", "frankreich", "franc", "franca", "francja",
  "frankrijk", "fra", "franciaorszag", "fran", "frankrike", "franche"
)
# `french_bad`: whole-word terms that switch the flag back off, so that
# places such as New France or French Guiana are not counted as France.
french_bad <- c("guiana", "guyane", "nouvelle france", "new france", "canada")
#------------------------------------------------------------------------------#
# find the number of rows
# only do once!
#geni<-fread("./1_raw_data/1_2_geni/geni_profiles_patched.csv",
#            showProgress=TRUE,
#            select = c(1))
#print(nrow(geni))

# = 153009891

#------------------------------------------------------------------------------#
# process the data chunk by chunk (one pre-split parquet part per iteration)

max_chunk <- 15

# Regexes matching any "good" / "bad" term as a whole word
# (\b is regex for word boundary). They are the same for every chunk, so
# build them once instead of once per term per chunk.
good_pattern <- paste0("\\b(", paste(french_good, collapse = "|"), ")\\b")
bad_pattern <- paste0("\\b(", paste(french_bad, collapse = "|"), ")\\b")

# Normalise a vector of place strings: strip accents, lower-case, turn every
# run of non-letters into a single space, trim, and map NA to "".
# NOTE(review): the result of iconv(..., "ASCII//TRANSLIT") is platform
# dependent; NAs are replaced last so that any NA it introduces cannot reach
# paste() as the text "NA".
clean_location_text <- function(x) {
  x <- iconv(x, to = "ASCII//TRANSLIT") # remove accents
  x <- str_to_lower(x)
  x <- str_replace_all(x, "[^a-z]+", " ") # remove everything but letters
  x <- str_squish(x) # but keep (single) white space
  x[is.na(x)] <- ""
  x
}

# Join the five (already cleaned) location fields into one ", "-separated
# string, skipping empty fields. Runs of separators left behind by empty
# fields are collapsed to ONE separator rather than deleted: deleting them
# would glue neighbouring fields together ("paris" + "france" ->
# "parisfrance") and hide the term from the whole-word match below.
combine_location <- function(city, county, state, country, place_name) {
  paste(city, county, state, country, place_name, sep = ", ") %>%
    str_replace_all("(, ){2,}", ", ") %>%
    str_remove("^, ") %>%
    str_remove(", $")
}

for (chunk_n in 0:max_chunk) {
  progress <- paste0(chunk_n + 1, "/", max_chunk + 1)

  print(paste0("Reading chunk ", progress))
  fname <- paste0(
    "./1_raw_data/1_2_geni/1_2_geni_chunked/geni_profiles/part-",
    chunk_n,
    ".parquet"
  )
  # NOTE(review): columns are picked by position, so this assumes the parquet
  # parts all share the column order these indices were written for; the code
  # below expects profile_id plus <event>_start_year / _city / _county /
  # _state / _country / _place_name for each event -- TODO confirm
  geni <- read_parquet(
    fname,
    col_select = c(
      1, # profile id
      13, 19, 20, 21, 22, 23, # birth
      27, 33, 34, 35, 36, 37, # baptism one changed to 27, why?
      41, 48, 49, 50, 51, 52, # death
      56, 62, 63, 64, 65, 66 # burial
    )
  ) %>%
    tidytable()
  gc()

  # time period: born 1600-1850, or no birth year and baptised 1600-1850
  # NOTE(review): this comment used to say 1700-1850 while the code filters
  # from 1600 -- confirm which window is intended
  geni <- geni %>%
    filter(
      (birth_start_year >= 1600 & birth_start_year <= 1850) |
        (is.na(birth_start_year) &
          (baptism_start_year >= 1600 & baptism_start_year <= 1850))
    )
  gc()

  #----------------------------------------------------------------------------#
  print(paste0("Combining locations: ", progress))

  # clean location strings
  geni <- geni %>%
    mutate(
      across(
        c(
          ends_with("_place_name"),
          ends_with("_city"),
          ends_with("_county"),
          ends_with("_state"),
          ends_with("_country")
        ),
        clean_location_text
      )
    )

  # paste together all the locations, one string per event
  geni <- geni %>%
    mutate(
      birth_loc = combine_location(
        birth_city, birth_county, birth_state, birth_country, birth_place_name
      )
    ) %>%
    mutate(
      baptism_loc = combine_location(
        baptism_city, baptism_county, baptism_state, baptism_country,
        baptism_place_name
      )
    ) %>%
    mutate(
      death_loc = combine_location(
        death_city, death_county, death_state, death_country, death_place_name
      )
    ) %>%
    mutate(
      burial_loc = combine_location(
        burial_city, burial_county, burial_state, burial_country,
        burial_place_name
      )
    )

  #----------------------------------------------------------------------------#
  print(paste0("Donating from baptisms and burials: ", progress))

  # baptism replaces a missing birth location, burial a missing death location
  # (if the donor is empty as well the location simply stays "")
  geni <- geni %>%
    mutate(birth_loc = case_when(birth_loc == "" ~ baptism_loc,
                                 TRUE ~ birth_loc)) %>%
    mutate(death_loc = case_when(death_loc == "" ~ burial_loc,
                                 TRUE ~ death_loc))

  #----------------------------------------------------------------------------#
  # detect french words
  # this step is slow
  print(paste0("Detecting french words: ", progress))

  # good terms are the terms we want (e.g. France)
  # bad are the ones we do not (e.g. New France)
  # flag = 1 when at least one good term and no bad term is present, else 0
  geni <- geni %>%
    mutate(
      born_france = as.numeric(
        str_detect(birth_loc, good_pattern) & !str_detect(birth_loc, bad_pattern)
      )
    ) %>%
    mutate(
      died_france = as.numeric(
        str_detect(death_loc, good_pattern) & !str_detect(death_loc, bad_pattern)
      )
    )
  gc()

  #----------------------------------------------------------------------------#
  print(paste0("Saving: ", progress))

  # keep only the profile id and the two flags of profiles tied to France
  geni <- geni %>%
    filter(born_france == 1 | died_france == 1) %>%
    select(profile_id, born_france, died_france)
  write_parquet(
    geni,
    paste0("./2_scripts/2_0_tempfiles/fr-chunk_", chunk_n, "_", max_chunk, ".parquet")
  )
  gc()
}

#------------------------------------------------------------------------------#
# reassemble parts: stack the per-chunk files written by the loop above into
# a single table and save it

chunk_dir <- "./2_scripts/2_0_tempfiles"

# anchored pattern: only files named like the per-chunk outputs
# (fr-chunk_<chunk>_<max chunk>.parquet) are picked up
files <- list.files(
  path = chunk_dir,
  pattern = "^fr-chunk_[0-9]+_[0-9]+\\.parquet$"
)
if (length(files) == 0) {
  # writing an empty result would only hide the problem from later scripts
  stop("No fr-chunk parquet files found in ", chunk_dir, call. = FALSE)
}

# read every part first and bind once, rather than growing the table
# (and copying it) on each iteration
parts <- lapply(files, function(file) {
  print(file)
  read_parquet(file.path(chunk_dir, file))
})
output <- bind_rows(parts)

write_parquet(output, './2_scripts/2_0_tempfiles/fr-pids-bd.parquet')
rm(parts, output)
gc()
